HEAD
The packages used in this project are: Rio: Chan et al. (2021) Readr: Wickham and Hester (2021) Haven: Wickham and Miller (2021)
dat <- import(here("data", "dat.csv")) %>%
filter(Category=="Passenger") %>%
clean_names() %>%
mutate_all(na_if,"") %>%
drop_na(survived, gender, class, age)
dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$survived <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked <- as.factor(dat$disembarked)
dat <- dat %>%
mutate(nationality2 = case_when(nationality == "English" ~ "English",
nationality == "Irish" ~ "Irish",
nationality == "American" ~ "American",
nationality == "Swedish" ~ "Swedish",
nationality == "Finnish" ~ "Finnish",
nationality == "Scottish" ~ "Scottish",
nationality == "French" ~ "French",
nationality == "Italian" ~ "Italian",
nationality == "Canadian" ~ "Canadian",
nationality == "Bulgarian" ~ "Bulgarian",
nationality == "Croatian" ~ "Croatian",
nationality == "Belgian" ~ "Belgian",
nationality == "Norwegian" ~ "Norwegian",
nationality == "Channel Islander" ~ "Channel Islander",
nationality == "Welsh" ~ "Welsh",
nationality == "Swiss" ~ "Swiss",
nationality == "German" ~ "German",
nationality == "Danish" ~ "Danish",
nationality == "Spanish" ~ "Spanish",
nationality == "Australian" ~ "Australian",
nationality == "Polish" ~ "Polish",
nationality == "South African" ~ "South African",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Dutch" ~ "Dutch",
nationality == "Lithuanian" ~ "Lithuanian",
nationality == "Greek" ~ "Greek",
nationality == "Portuguese" ~ "Portuguese",
nationality == "Uruguayan" ~ "Uruguayan",
nationality == "Chinese" ~ "Chinese",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Cape Verdean" ~ "Cape Verdean",
nationality == "Egyptian" ~ "Egyptian",
nationality == "Japanese" ~ "Japanese",
nationality == "Hungarian" ~ "Hungarian",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Latvian" ~ "Latvian",
nationality == "Austrian" ~ "Austrian",
nationality == "Greek" ~ "Greek",
nationality == "Mexican" ~ "Mexican",
nationality == "Sweden" ~ "Sweedish",
nationality == "Turkish" ~ "Turkish",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Guyanese" ~ "Guyanese",
nationality == "Haitian" ~ "Haitian",
nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
nationality == "Unknown" ~ "Unknown",
TRUE ~ "Other - Multiple", ))
dat <- dat %>%
mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))
# Breakdown of passengers by class
dat %>%
group_by(class) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
adorn_totals() %>%
kable(caption = "Breakdown of Passengers by Class and Gender",
col.names = c("Class", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Count | Percent |
|---|---|---|
| 1st Class | 324 | 24.64 |
| 2nd Class | 284 | 21.60 |
| 3rd Class | 707 | 53.76 |
| Total | 1315 | 100.00 |
# Breakdown of passengers by class and gender
dat %>%
group_by(class, gender) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
kable(caption = "Breakdown of Passengers by Class and Gender",
col.names = c("Class", "Gender", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Gender | Count | Percent |
|---|---|---|---|
| 1st Class | Female | 144 | 44.44 |
| 1st Class | Male | 180 | 55.56 |
| 2nd Class | Female | 106 | 37.32 |
| 2nd Class | Male | 178 | 62.68 |
| 3rd Class | Female | 216 | 30.55 |
| 3rd Class | Male | 491 | 69.45 |
# Breakdown of passenger nationalities
dat %>%
filter(!is.na(nationality2)) %>%
group_by(nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities",
col.names = c("Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Nationality | Count | Percent |
|---|---|---|
| English | 295 | 22.43 |
| American | 242 | 18.40 |
| Irish | 122 | 9.28 |
| Other - Multiple | 108 | 8.21 |
| Swedish | 99 | 7.53 |
| Syrian/Lebanese | 85 | 6.46 |
| Finnish | 58 | 4.41 |
| Canadian | 37 | 2.81 |
| Bulgarian | 31 | 2.36 |
| Croatian | 28 | 2.13 |
| French | 26 | 1.98 |
| Norwegian | 26 | 1.98 |
| Belgian | 25 | 1.90 |
| Scottish | 17 | 1.29 |
| Channel Islander | 15 | 1.14 |
| Swiss | 13 | 0.99 |
| Danish | 10 | 0.76 |
| Italian | 9 | 0.68 |
| German | 8 | 0.61 |
| Spanish | 8 | 0.61 |
| Welsh | 8 | 0.61 |
| Polish | 6 | 0.46 |
| Bosnian | 4 | 0.30 |
| Hong Kongese | 4 | 0.30 |
| South African | 4 | 0.30 |
| Greek | 3 | 0.23 |
| Lithuanian | 3 | 0.23 |
| Uruguayan | 3 | 0.23 |
| Australian | 2 | 0.15 |
| Chinese | 2 | 0.15 |
| Portuguese | 2 | 0.15 |
| Slovenian | 2 | 0.15 |
| Austrian | 1 | 0.08 |
| Dutch | 1 | 0.08 |
| Egyptian | 1 | 0.08 |
| Haitian | 1 | 0.08 |
| Hungarian | 1 | 0.08 |
| Japanese | 1 | 0.08 |
| Latvian | 1 | 0.08 |
| Mexican | 1 | 0.08 |
| Sweedish | 1 | 0.08 |
| Turkish | 1 | 0.08 |
# Breakdown of passenger nationalities by class
dat %>%
filter(!is.na(nationality2)) %>%
group_by(class, nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities by Class (All)",
col.names = c("Class", "Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Nationality | Count | Percent |
|---|---|---|---|
| 1st Class | American | 195 | 60.19 |
| 1st Class | English | 38 | 11.73 |
| 1st Class | Canadian | 27 | 8.33 |
| 1st Class | Other - Multiple | 14 | 4.32 |
| 1st Class | French | 10 | 3.09 |
| 1st Class | Swiss | 6 | 1.85 |
| 1st Class | German | 5 | 1.54 |
| 1st Class | Irish | 5 | 1.54 |
| 1st Class | Spanish | 4 | 1.23 |
| 1st Class | Swedish | 4 | 1.23 |
| 1st Class | Scottish | 3 | 0.93 |
| 1st Class | Uruguayan | 3 | 0.93 |
| 1st Class | Belgian | 2 | 0.62 |
| 1st Class | Italian | 2 | 0.62 |
| 1st Class | Channel Islander | 1 | 0.31 |
| 1st Class | Dutch | 1 | 0.31 |
| 1st Class | Egyptian | 1 | 0.31 |
| 1st Class | Mexican | 1 | 0.31 |
| 1st Class | Norwegian | 1 | 0.31 |
| 1st Class | Polish | 1 | 0.31 |
| 2nd Class | English | 145 | 51.06 |
| 2nd Class | Other - Multiple | 25 | 8.80 |
| 2nd Class | American | 24 | 8.45 |
| 2nd Class | Channel Islander | 12 | 4.23 |
| 2nd Class | Irish | 12 | 4.23 |
| 2nd Class | French | 11 | 3.87 |
| 2nd Class | Scottish | 8 | 2.82 |
| 2nd Class | Finnish | 6 | 2.11 |
| 2nd Class | Swedish | 6 | 2.11 |
| 2nd Class | Canadian | 5 | 1.76 |
| 2nd Class | South African | 4 | 1.41 |
| 2nd Class | Spanish | 4 | 1.41 |
| 2nd Class | Danish | 3 | 1.06 |
| 2nd Class | Italian | 3 | 1.06 |
| 2nd Class | Lithuanian | 2 | 0.70 |
| 2nd Class | Swiss | 2 | 0.70 |
| 2nd Class | Syrian/Lebanese | 2 | 0.70 |
| 2nd Class | Welsh | 2 | 0.70 |
| 2nd Class | Australian | 1 | 0.35 |
| 2nd Class | Belgian | 1 | 0.35 |
| 2nd Class | German | 1 | 0.35 |
| 2nd Class | Haitian | 1 | 0.35 |
| 2nd Class | Hungarian | 1 | 0.35 |
| 2nd Class | Japanese | 1 | 0.35 |
| 2nd Class | Norwegian | 1 | 0.35 |
| 2nd Class | Portuguese | 1 | 0.35 |
| 3rd Class | English | 112 | 15.84 |
| 3rd Class | Irish | 105 | 14.85 |
| 3rd Class | Swedish | 89 | 12.59 |
| 3rd Class | Syrian/Lebanese | 83 | 11.74 |
| 3rd Class | Other - Multiple | 69 | 9.76 |
| 3rd Class | Finnish | 52 | 7.36 |
| 3rd Class | Bulgarian | 31 | 4.38 |
| 3rd Class | Croatian | 28 | 3.96 |
| 3rd Class | Norwegian | 24 | 3.39 |
| 3rd Class | American | 23 | 3.25 |
| 3rd Class | Belgian | 22 | 3.11 |
| 3rd Class | Danish | 7 | 0.99 |
| 3rd Class | Scottish | 6 | 0.85 |
| 3rd Class | Welsh | 6 | 0.85 |
| 3rd Class | Canadian | 5 | 0.71 |
| 3rd Class | French | 5 | 0.71 |
| 3rd Class | Polish | 5 | 0.71 |
| 3rd Class | Swiss | 5 | 0.71 |
| 3rd Class | Bosnian | 4 | 0.57 |
| 3rd Class | Hong Kongese | 4 | 0.57 |
| 3rd Class | Italian | 4 | 0.57 |
| 3rd Class | Greek | 3 | 0.42 |
| 3rd Class | Channel Islander | 2 | 0.28 |
| 3rd Class | Chinese | 2 | 0.28 |
| 3rd Class | German | 2 | 0.28 |
| 3rd Class | Slovenian | 2 | 0.28 |
| 3rd Class | Australian | 1 | 0.14 |
| 3rd Class | Austrian | 1 | 0.14 |
| 3rd Class | Latvian | 1 | 0.14 |
| 3rd Class | Lithuanian | 1 | 0.14 |
| 3rd Class | Portuguese | 1 | 0.14 |
| 3rd Class | Sweedish | 1 | 0.14 |
| 3rd Class | Turkish | 1 | 0.14 |
# Average age by class
dat %>%
group_by(class) %>%
summarize(avg_age = mean(age), std_age = sd(age), min_age = min(age),
max_age = max(age)) %>%
kable(caption = "Average Age by Class",
col.names = c("Class", "Average Age", "SD Age", "Minimum Age", "Maximum Age"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Average Age | SD Age | Minimum Age | Maximum Age |
|---|---|---|---|---|
| 1st Class | 39.14 | 13.55 | 0 | 71 |
| 2nd Class | 30.01 | 13.90 | 0 | 71 |
| 3rd Class | 25.12 | 11.71 | 0 | 74 |
# Survival rate by class
dat %>%
group_by(class, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, survived) %>%
kable(caption = "Survival Rate by Class",
col.names = c("Class", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Survived | Count | Percent |
|---|---|---|---|
| 1st Class | Lost | 123 | 37.96 |
| 1st Class | Saved | 201 | 62.04 |
| 2nd Class | Lost | 166 | 58.45 |
| 2nd Class | Saved | 118 | 41.55 |
| 3rd Class | Lost | 526 | 74.40 |
| 3rd Class | Saved | 181 | 25.60 |
# Survival rate by gender
dat %>%
group_by(gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(gender, survived) %>%
kable(caption = "Survival Rate by Gender",
col.names = c("Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Gender | Survived | Count | Percent |
|---|---|---|---|
| Female | Lost | 127 | 27.25 |
| Female | Saved | 339 | 72.75 |
| Male | Lost | 688 | 81.04 |
| Male | Saved | 161 | 18.96 |
# Survival rate by class and gender
dat %>%
group_by(class, gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, gender) %>%
kable(caption = "Survival Rate by Class and Gender",
col.names = c("Class", "Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Gender | Survived | Count | Percent |
|---|---|---|---|---|
| 1st Class | Female | Lost | 5 | 3.47 |
| 1st Class | Female | Saved | 139 | 96.53 |
| 1st Class | Male | Lost | 118 | 65.56 |
| 1st Class | Male | Saved | 62 | 34.44 |
| 2nd Class | Female | Lost | 12 | 11.32 |
| 2nd Class | Female | Saved | 94 | 88.68 |
| 2nd Class | Male | Lost | 154 | 86.52 |
| 2nd Class | Male | Saved | 24 | 13.48 |
| 3rd Class | Female | Lost | 110 | 50.93 |
| 3rd Class | Female | Saved | 106 | 49.07 |
| 3rd Class | Male | Lost | 416 | 84.73 |
| 3rd Class | Male | Saved | 75 | 15.27 |
surv_classhist <- dat %>%
ggplot(aes(age, class)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class",
x = "Age Distribution", y = "Passenger Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
surv_agehist <- dat %>%
ggplot(aes(age, gender)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
surv_ageclass_hist <- dat %>%
ggplot(aes(age, gender)) +
facet_wrap(~class, nrow=3) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class and Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
modeldat <- dat %>%
select(survived, gender, class, age)
ctree <- ctree(survived ~ gender + class + age, data=modeldat)
plot(ctree)
fares <- import(here("data", "avgfare.csv")) %>%
clean_names()
fares$accommodation <- as.factor(fares$accommodation)
fares$accommodation <- factor(fares$accommodation, levels = c("First-class parlor suite", "First-class cabin", "Second-class cabin", "Third-class cabin"))
p1921 <- (17.9/9.7)
fares$fare_1921 <- p1921*fares$fare_1912
fares$fare_1921 <- round(fares$fare_1921, 2)
p1931 <- (15.2/9.7)
fares$fare_1931 <- p1931*fares$fare_1912
fares$fare_1931 <- round(fares$fare_1931, 2)
p1941 <- (14.7/9.7)
fares$fare_1941 <- p1941*fares$fare_1912
fares$fare_1941 <- round(fares$fare_1941, 2)
p1951 <- (26.0/9.7)
fares$fare_1951 <- p1951*fares$fare_1912
fares$fare_1951 <- round(fares$fare_1951, 2)
p1961 <- (29.9/9.7)
fares$fare_1961 <- p1961*fares$fare_1912
fares$fare_1961 <- round(fares$fare_1961, 2)
p1971 <- (40.5/9.7)
fares$fare_1971 <- p1971*fares$fare_1912
fares$fare_1971 <- round(fares$fare_1971, 2)
p1981 <- (90.9/9.7)
fares$fare_1981 <- p1981*fares$fare_1912
fares$fare_1981 <- round(fares$fare_1981, 2)
p1991 <- (136.2/9.7)
fares$fare_1991 <- p1991*fares$fare_1912
fares$fare_1991 <- round(fares$fare_1991, 2)
p2001 <- (177.1/9.7)
fares$fare_2001 <- p2001*fares$fare_1912
fares$fare_2001 <- round(fares$fare_2001, 2)
p2011 <- (224.9/9.7)
fares$fare_2011 <- p2011*fares$fare_1912
fares$fare_2011 <- round(fares$fare_2011, 2)
p2021 <- (274.3/9.7)
fares$fare_2021 <- p2021*fares$fare_1912
fares$fare_2021 <- round(fares$fare_2021, 2)
fares_tidy <- fares %>%
pivot_longer(cols = starts_with("fare"),
names_to = "year",
names_prefix = "fare_",
values_to = "price", names_transform = list(year = as.integer))
fare_graph <- fares_tidy %>%
ggplot(aes(year, price, colour=accommodation)) +
geom_line() +
geom_point() +
scale_colour_brewer(palette="Spectral") +
facet_wrap(~ accommodation, 4, scales = "free") +
xlim(1912,2021) +
theme(panel.spacing = unit(1, "lines")) +
labs(y = "Price ($USD)", x = "Year", title = "Inflation-Adjusted Titanic Ticket Prices",
subtitle = "From 1912 to 2021", colour = "Accommodation") +
theme_minimal()
ggplotly(fare_graph)
When taking inflation rates into consideration, we see that the average price for a first class cabin in 1912 was $150.00, which today would be $4,241.74
Chan, Chung-hong, Geoffrey CH Chan, Thomas J. Leeper, and Jason Becker. 2021. Rio: A Swiss-Army Knife for Data File I/O.
Wickham, Hadley, and Jim Hester. 2021. Readr: Read Rectangular Text Data. https://CRAN.R-project.org/package=readr.
Wickham, Hadley, and Evan Miller. 2021. Haven: Import and Export ’Spss’, ’Stata’ and ’Sas’ Files. https://CRAN.R-project.org/package=haven.
The packages used in this project are: Rio: Chan et al. (2021) Readr: Wickham and Hester (2021) Haven: Wickham and Miller (2021)
dat <- import(here("data", "dat.csv")) %>%
clean_names() %>%
mutate_all(na_if,"")
dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$survived <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked <- as.factor(dat$disembarked)
# # PEER Review MV: You could consider using dplyr to recode these new variables and then also creating factors that make more sense for some of the variables that have multiple variables. For example, numbering factors based on frequency for marital status. This probably isn't very helpful but a little cleaner than code above.
# dat <- dat %>%
# mutate(gender = as.factor(gender),
# marital_status = fct_infreq(marital_status),
# category = as.factor(category),
# class = as.factor(class),
# survived = as.factor(survived),
# embarked = as.factor(embarked),
# disembarked = as.factor(disembarked))
dat <- dat %>%
mutate(nationality2 = case_when(nationality == "English" ~ "English",
nationality == "Irish" ~ "Irish",
nationality == "American" ~ "American",
nationality == "Swedish" ~ "Swedish",
nationality == "Finnish" ~ "Finnish",
nationality == "Scottish" ~ "Scottish",
nationality == "French" ~ "French",
nationality == "Italian" ~ "Italian",
nationality == "Canadian" ~ "Canadian",
nationality == "Bulgarian" ~ "Bulgarian",
nationality == "Croatian" ~ "Croatian",
nationality == "Belgian" ~ "Belgian",
nationality == "Norwegian" ~ "Norwegian",
nationality == "Channel Islander" ~ "Channel Islander",
nationality == "Welsh" ~ "Welsh",
nationality == "Swiss" ~ "Swiss",
nationality == "German" ~ "German",
nationality == "Danish" ~ "Danish",
nationality == "Spanish" ~ "Spanish",
nationality == "Australian" ~ "Australian",
nationality == "Polish" ~ "Polish",
nationality == "South African" ~ "South African",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Dutch" ~ "Dutch",
nationality == "Lithuanian" ~ "Lithuanian",
nationality == "Greek" ~ "Greek",
nationality == "Portuguese" ~ "Portuguese",
nationality == "Uruguayan" ~ "Uruguayan",
nationality == "Chinese" ~ "Chinese",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Cape Verdean" ~ "Cape Verdean",
nationality == "Egyptian" ~ "Egyptian",
nationality == "Japanese" ~ "Japanese",
nationality == "Hungarian" ~ "Hungarian",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Latvian" ~ "Latvian",
nationality == "Austrian" ~ "Austrian",
nationality == "Greek" ~ "Greek",
nationality == "Mexican" ~ "Mexican",
nationality == "Sweden" ~ "Swedish",
nationality == "Turkish" ~ "Turkish",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Guyanese" ~ "Guyanese",
nationality == "Haitian" ~ "Haitian",
nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
nationality == "Unknown" ~ "Unknown",
TRUE ~ "Other - Multiple", ))
## PEER Review MV: For Nationality, Consider creating a more collapsed factor variable that only has those nationalities with 10 or more individuals and then an other category. Then you could potentially create a cleaner bar graph. Added some code below
# dat <- dat %>%
# mutate(nationality_cat = fct_lump_min(nationality,10),
# nationality_cat = fct_infreq(nationality_cat))
dat <- dat %>%
mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))
datpass <- dat %>%
filter(category=="Passenger") %>%
select(survived, gender, class, age) %>%
na.omit()
# Breakdown of passengers by class and gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(gender)) %>%
group_by(class, gender) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
kable(caption = "Breakdown of Passengers by Class and Gender",
col.names = c("Class", "Gender", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Gender | Count | Percent |
|---|---|---|---|
| 1st Class | Female | 153 | 43.71 |
| 1st Class | Male | 197 | 56.29 |
| 2nd Class | Female | 112 | 38.36 |
| 2nd Class | Male | 180 | 61.64 |
| 3rd Class | Female | 216 | 30.47 |
| 3rd Class | Male | 493 | 69.53 |
# Breakdown of passenger nationalities
dat %>%
filter(!is.na(nationality2)) %>%
group_by(nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities",
col.names = c("Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Nationality | Count | Percent |
|---|---|---|
| English | 1037 | 42.36 |
| Irish | 361 | 14.75 |
| American | 246 | 10.05 |
| Other - Multiple | 116 | 4.74 |
| Swedish | 100 | 4.08 |
| Syrian/Lebanese | 86 | 3.51 |
| Finnish | 58 | 2.37 |
| Scottish | 49 | 2.00 |
| French | 44 | 1.80 |
| Italian | 41 | 1.67 |
| Canadian | 39 | 1.59 |
| Bulgarian | 33 | 1.35 |
| Croatian | 28 | 1.14 |
| Belgian | 26 | 1.06 |
| Norwegian | 26 | 1.06 |
| Channel Islander | 25 | 1.02 |
| Welsh | 23 | 0.94 |
| Swiss | 22 | 0.90 |
| German | 14 | 0.57 |
| Danish | 11 | 0.45 |
| Spanish | 9 | 0.37 |
| Australian | 7 | 0.29 |
| Polish | 6 | 0.25 |
| South African | 5 | 0.20 |
| Bosnian | 4 | 0.16 |
| Hong Kongese | 4 | 0.16 |
| Dutch | 3 | 0.12 |
| Greek | 3 | 0.12 |
| Lithuanian | 3 | 0.12 |
| Uruguayan | 3 | 0.12 |
| Chinese | 2 | 0.08 |
| Portuguese | 2 | 0.08 |
| Slovenian | 2 | 0.08 |
| Austrian | 1 | 0.04 |
| Cape Verdean | 1 | 0.04 |
| Egyptian | 1 | 0.04 |
| Guyanese | 1 | 0.04 |
| Haitian | 1 | 0.04 |
| Hungarian | 1 | 0.04 |
| Japanese | 1 | 0.04 |
| Latvian | 1 | 0.04 |
| Mexican | 1 | 0.04 |
| Turkish | 1 | 0.04 |
# # PEER Review MV: - Consider visualizing some of your data that is currently in tables into bar graphs. This one doesn't have all the info on the table, but easier to see which passengers were most represented
# dat %>%
# filter(!is.na(nationality2)) %>%
# group_by(nationality_cat) %>%
# summarize(count = n()) %>%
# mutate(percent = (count/sum(count))*100) %>%
# arrange((percent)) %>%
# ggplot(aes(y = nationality_cat)) +
# geom_col(aes(x = percent), fill = "dark red") +
# geom_text(aes(x = percent, label=round(percent,2)), hjust = -.2, size = 3) +
# theme_minimal() +
# labs(x = "Percentage of all passenges", y = "Nationality")
# Breakdown of passenger nationalities by class (all)
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(nationality2)) %>%
group_by(class, nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities by Class (All)",
col.names = c("Class", "Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Nationality | Count | Percent |
|---|---|---|---|
| 1st Class | American | 195 | 57.35 |
| 1st Class | English | 51 | 15.00 |
| 1st Class | Canadian | 27 | 7.94 |
| 1st Class | Other - Multiple | 14 | 4.12 |
| 1st Class | French | 10 | 2.94 |
| 1st Class | Irish | 6 | 1.76 |
| 1st Class | Swiss | 6 | 1.76 |
| 1st Class | German | 5 | 1.47 |
| 1st Class | Scottish | 5 | 1.47 |
| 1st Class | Spanish | 4 | 1.18 |
| 1st Class | Swedish | 4 | 1.18 |
| 1st Class | Uruguayan | 3 | 0.88 |
| 1st Class | Belgian | 2 | 0.59 |
| 1st Class | Italian | 2 | 0.59 |
| 1st Class | Channel Islander | 1 | 0.29 |
| 1st Class | Dutch | 1 | 0.29 |
| 1st Class | Egyptian | 1 | 0.29 |
| 1st Class | Mexican | 1 | 0.29 |
| 1st Class | Norwegian | 1 | 0.29 |
| 1st Class | Polish | 1 | 0.29 |
| 2nd Class | English | 145 | 51.06 |
| 2nd Class | Other - Multiple | 25 | 8.80 |
| 2nd Class | American | 24 | 8.45 |
| 2nd Class | Channel Islander | 12 | 4.23 |
| 2nd Class | Irish | 12 | 4.23 |
| 2nd Class | French | 11 | 3.87 |
| 2nd Class | Scottish | 8 | 2.82 |
| 2nd Class | Finnish | 6 | 2.11 |
| 2nd Class | Swedish | 6 | 2.11 |
| 2nd Class | Canadian | 5 | 1.76 |
| 2nd Class | South African | 4 | 1.41 |
| 2nd Class | Spanish | 4 | 1.41 |
| 2nd Class | Danish | 3 | 1.06 |
| 2nd Class | Italian | 3 | 1.06 |
| 2nd Class | Lithuanian | 2 | 0.70 |
| 2nd Class | Swiss | 2 | 0.70 |
| 2nd Class | Syrian/Lebanese | 2 | 0.70 |
| 2nd Class | Welsh | 2 | 0.70 |
| 2nd Class | Australian | 1 | 0.35 |
| 2nd Class | Belgian | 1 | 0.35 |
| 2nd Class | German | 1 | 0.35 |
| 2nd Class | Haitian | 1 | 0.35 |
| 2nd Class | Hungarian | 1 | 0.35 |
| 2nd Class | Japanese | 1 | 0.35 |
| 2nd Class | Norwegian | 1 | 0.35 |
| 2nd Class | Portuguese | 1 | 0.35 |
| 3rd Class | English | 112 | 15.80 |
| 3rd Class | Irish | 105 | 14.81 |
| 3rd Class | Swedish | 90 | 12.69 |
| 3rd Class | Syrian/Lebanese | 83 | 11.71 |
| 3rd Class | Other - Multiple | 69 | 9.73 |
| 3rd Class | Finnish | 52 | 7.33 |
| 3rd Class | Bulgarian | 33 | 4.65 |
| 3rd Class | Croatian | 28 | 3.95 |
| 3rd Class | Norwegian | 24 | 3.39 |
| 3rd Class | American | 23 | 3.24 |
| 3rd Class | Belgian | 22 | 3.10 |
| 3rd Class | Danish | 7 | 0.99 |
| 3rd Class | Scottish | 6 | 0.85 |
| 3rd Class | Welsh | 6 | 0.85 |
| 3rd Class | Canadian | 5 | 0.71 |
| 3rd Class | French | 5 | 0.71 |
| 3rd Class | Polish | 5 | 0.71 |
| 3rd Class | Swiss | 5 | 0.71 |
| 3rd Class | Bosnian | 4 | 0.56 |
| 3rd Class | Hong Kongese | 4 | 0.56 |
| 3rd Class | Italian | 4 | 0.56 |
| 3rd Class | Greek | 3 | 0.42 |
| 3rd Class | Channel Islander | 2 | 0.28 |
| 3rd Class | Chinese | 2 | 0.28 |
| 3rd Class | German | 2 | 0.28 |
| 3rd Class | Slovenian | 2 | 0.28 |
| 3rd Class | Australian | 1 | 0.14 |
| 3rd Class | Austrian | 1 | 0.14 |
| 3rd Class | Latvian | 1 | 0.14 |
| 3rd Class | Lithuanian | 1 | 0.14 |
| 3rd Class | Portuguese | 1 | 0.14 |
| 3rd Class | Turkish | 1 | 0.14 |
# PEER Review MV: Here, I think another good opportunity to visualize tables that intersect Nationality and class. I think you are able to more easily see variation in nationality by class. Interesting how Americans were concentrated in first class, and third class varied much more.
# dat %>%
# filter(category == "Passenger") %>%
# filter(!is.na(nationality2)) %>%
# group_by(class, nationality_cat) %>%
# summarize(count = n()) %>%
# mutate(percent = (count/sum(count))*100) %>%
# arrange(class, desc(percent)) %>%
# ggplot(aes(y = nationality_cat)) +
# geom_col(aes(x = percent), fill = "dark red") +
# facet_wrap(~fct_infreq(class)) +
# geom_text(aes(x = percent, label=round(percent,2)), hjust = -.1, size = 3) +
# theme_minimal() +
# labs(x = "Percent of passengers by class", y = "Nationality")
# Average age by class
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
group_by(class) %>%
summarize(avg_age = mean(age), min_age = min(age), max_age = max(age)) %>%
kable(caption = "Average Age by Class",
col.names = c("Class", "Average Age", "Minimum Age", "Maximum Age"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Average Age | Minimum Age | Maximum Age |
|---|---|---|---|
| 1st Class | 39.12 | 0 | 71 |
| 2nd Class | 30.01 | 0 | 71 |
| 3rd Class | 25.12 | 0 | 74 |
# Survival rate by class
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(class, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, survived) %>%
kable(caption = "Survival Rate by Class",
col.names = c("Class", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Survived | Count | Percent |
|---|---|---|---|
| 1st Class | Lost | 123 | 37.96 |
| 1st Class | Saved | 201 | 62.04 |
| 2nd Class | Lost | 166 | 58.45 |
| 2nd Class | Saved | 118 | 41.55 |
| 3rd Class | Lost | 528 | 74.47 |
| 3rd Class | Saved | 181 | 25.53 |
# Survival rate by gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(gender, survived) %>%
kable(caption = "Survival Rate by Gender",
col.names = c("Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Gender | Survived | Count | Percent |
|---|---|---|---|
| Female | Lost | 127 | 27.25 |
| Female | Saved | 339 | 72.75 |
| Male | Lost | 690 | 81.08 |
| Male | Saved | 161 | 18.92 |
# Survival rate by class and gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(class, gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, gender) %>%
kable(caption = "Survival Rate by Class and Gender",
col.names = c("Class", "Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
| Class | Gender | Survived | Count | Percent |
|---|---|---|---|---|
| 1st Class | Female | Lost | 5 | 3.47 |
| 1st Class | Female | Saved | 139 | 96.53 |
| 1st Class | Male | Lost | 118 | 65.56 |
| 1st Class | Male | Saved | 62 | 34.44 |
| 2nd Class | Female | Lost | 12 | 11.32 |
| 2nd Class | Female | Saved | 94 | 88.68 |
| 2nd Class | Male | Lost | 154 | 86.52 |
| 2nd Class | Male | Saved | 24 | 13.48 |
| 3rd Class | Female | Lost | 110 | 50.93 |
| 3rd Class | Female | Saved | 106 | 49.07 |
| 3rd Class | Male | Lost | 418 | 84.79 |
| 3rd Class | Male | Saved | 75 | 15.21 |
## PEER Review MV: Cool graphs!
surv_classhist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, class)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class",
x = "Age Distribution", y = "Passenger Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
surv_agehist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, gender)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
surv_ageclass_hist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, gender)) +
facet_wrap(~class, nrow=3) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class and Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
ctree <- ctree(survived ~ gender + class + age, data=datpass)
## Note: We are aware the "saved" and "lost" labels are switched in the first graph and are working to figure out why.
plot(ctree)
# PEER Review MV: I am not at all familiar with the ctree command or what it is doing but it appears to be predicting survival rates for each combination of categories as you go down the flowcharts. However, I'm not sure why class appears twice at different levels. If I follow the left most line, it makes sense that female and first class would have highest probability of save rate. But not sure why class appears twice along the same hierarchy. There must be some reasoning behind this but it will be important to clearly explain this in the final write up.
ggparty(ctree) +
geom_edge() +
geom_edge_label() +
geom_node_splitvar() +
geom_node_plot(gglist = list(geom_bar(aes(x = "", fill = survived),
position = position_fill()),
theme_bw(),
xlab("Survival"), ylab("Percentage")),
shared_axis_labels = TRUE,
legend_separator = TRUE,)
fares <- import(here("data", "avgfare.csv")) %>%
clean_names()
fares$accommodation <- as.factor(fares$accommodation)
fares$accommodation <- factor(fares$accommodation, levels = c("First-class parlor suite", "First-class cabin", "Second-class cabin", "Third-class cabin"))
## PEER Review MV: Would be great to know where these numbers are coming from, a little bit more annotation would be helpful.
p1921 <- (17.9/9.7)
fares$fare_1921 <- p1921*fares$fare_1912
fares$fare_1921 <- round(fares$fare_1921, 2)
p1931 <- (15.2/9.7)
fares$fare_1931 <- p1931*fares$fare_1912
fares$fare_1931 <- round(fares$fare_1931, 2)
p1941 <- (14.7/9.7)
fares$fare_1941 <- p1941*fares$fare_1912
fares$fare_1941 <- round(fares$fare_1941, 2)
p1951 <- (26.0/9.7)
fares$fare_1951 <- p1951*fares$fare_1912
fares$fare_1951 <- round(fares$fare_1951, 2)
p1961 <- (29.9/9.7)
fares$fare_1961 <- p1961*fares$fare_1912
fares$fare_1961 <- round(fares$fare_1961, 2)
p1971 <- (40.5/9.7)
fares$fare_1971 <- p1971*fares$fare_1912
fares$fare_1971 <- round(fares$fare_1971, 2)
p1981 <- (90.9/9.7)
fares$fare_1981 <- p1981*fares$fare_1912
fares$fare_1981 <- round(fares$fare_1981, 2)
p1991 <- (136.2/9.7)
fares$fare_1991 <- p1991*fares$fare_1912
fares$fare_1991 <- round(fares$fare_1991, 2)
p2001 <- (177.1/9.7)
fares$fare_2001 <- p2001*fares$fare_1912
fares$fare_2001 <- round(fares$fare_2001, 2)
p2011 <- (224.9/9.7)
fares$fare_2011 <- p2011*fares$fare_1912
fares$fare_2011 <- round(fares$fare_2011, 2)
p2021 <- (274.3/9.7)
fares$fare_2021 <- p2021*fares$fare_1912
fares$fare_2021 <- round(fares$fare_2021, 2)
fares_tidy <- fares %>%
pivot_longer(cols = starts_with("fare"),
names_to = "year",
names_prefix = "fare_",
values_to = "price", names_transform = list(year = as.integer))
# # PEER Review MV: Sweet graph!
fare_graph <- fares_tidy %>%
ggplot(aes(year, price, colour=accommodation)) +
geom_line() +
geom_point() +
scale_colour_brewer(palette="Spectral") +
facet_wrap(~ accommodation, 4, scales = "free") +
xlim(1912,2021) +
theme(panel.spacing = unit(1, "lines")) +
labs(y = "Price ($USD)", x = "Year", title = "Inflation-Adjusted Titanic Ticket Prices", subtitle = "From 1912 to 2021", colour = "Accommodation") +
theme_minimal()
ggplotly(fare_graph)
When taking inflation rates into consideration, we see that the average price for a first class cabin in 1912 was $150.00, which today would be $4,241.74